#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#' theme: united
#' ---

# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into this run. The argument is spelled out as
# `all.names = TRUE` (dot-prefixed objects are cleared as well): the previous
# `all=t` relied on partial argument matching and passed the base transpose
# function `t` rather than the logical constant TRUE.
rm(list = ls(all.names = TRUE))

#' #Setup filenames

# Base name of the dataset being processed; it is reused further down to build
# the names of the output files and the report title.
filename <- "Rwanda_Public Use" # !!!Update filename
# Script holding the helper functions used throughout this file.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file

#' #Setup data, functions and create dictionary for dataset review
source(functions_vers)
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (population <100,000)
# Large Location: Location (population >100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

#' #Direct PII: variables to be removed
#' !!!No direct PII
#' 
#' #Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects
#' !!!Replace vector in "variables" field below with relevant variable names
#' 
# Encode Direct PII-team
#' !!!No direct PII-team
#' 
#' #Small locations: Encode locations  with pop <100,000 using random large numbers
#'  !!!Include relevant variables, but check their population size first to confirm they are <100,000
#' Remove redundant small location information

# Drop "nsector" and the sector dummies sectordum1 ... sectordum15 from the
# data. Columns that are not present are simply ignored by `%in%`.
# NOTE(review): the hand-written list this replaces named "sectordum15" twice;
# confirm that no further dummy (e.g. a sectordum16) was meant to be dropped.
mydata <- mydata[!(names(mydata) %in% c("nsector", paste0("sectordum", 1:15)))]

#' Relabel small locations

# Give each of the fifteen sector dummies b_sectordum1 ... b_sectordum15 the
# variable label "Bl_Sector==<k>", where <k> is the dummy's own number.
for (sector_no in 1:15) {
  var_label(mydata[[paste0("b_sectordum", sector_no)]]) <-
    paste0("Bl_Sector==", sector_no)
}
rm(sector_no) # do not leave the loop counter behind in the workspace

# Small-location columns handed to the encode_location() helper (defined in
# the sourced functions file); 999999 is passed as its `missing` argument.
locvars <- c("e_v1d", "e_v4d")
mydata <- encode_location(variables = locvars, missing = 999999)

#' #Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" of 10 or less. 

# Break points and value labels shared by the two education recodes below.
break_edu <- c(1, 3, 9, 15, 88)
labels_edu <- setNames(
  c(1, 2, 3, 4, 5),
  c(
    "1. No school, Kindergarten/Nursery",
    "2. Primary (P1-P6)",
    "3. Secondary (S1-S6)",
    "4. Vocational training or College/University",
    "5. Don't know"
  )
)

# Apply the same recode to b_hh8 and to b_hh9; 8888 is passed as the
# `missing` argument of the ordinal_recode() helper.
mydata <- ordinal_recode(
  variable = "b_hh8",
  break_points = break_edu,
  missing = 8888,
  value_labels = labels_edu
)
mydata <- ordinal_recode(
  variable = "b_hh9",
  break_points = break_edu,
  missing = 8888,
  value_labels = labels_edu
)

# Top-code household composition variables with large and unusual numbers:
# both variables use a break point of 10 (cases with 10 or more individuals).

mydata <- top_recode("b_hh_masked", break_point = 10, missing = 999999)
mydata <- top_recode("hh", break_point = 10, missing = 999999)

# Top-code the number of household adults and of household adults working
# (presumably b_hh6_masked and b_hh7_masked respectively -- confirm against the
# dictionary). Note the break points here are 6 and 4, not 10.

mydata <- top_recode("b_hh6_masked", break_point = 6, missing = 999999)
mydata <- top_recode("b_hh7_masked", break_point = 4, missing = 999999)

# Date-of-birth detail is a strong identifier and ages are already provided in
# separate variables, so the two month columns b_d2_month and e_d1a_month are
# dropped here.

mydata <- mydata[is.na(match(names(mydata), c("b_d2_month", "e_d1a_month")))]

# !!!Include relevant variables in list below

# Candidate indirect-PII variables, in exactly the same order as the original
# hand-written list (535 names). Families that follow a regular letter or
# number pattern are generated with paste0(); irregular names are spelled out.
indirect_PII <- c(
  # --- b_-prefixed derived indicators ---
  "b_age_imputed", "b_conditions", "b_filliterate", "b_fjunsecondary",
  "b_fprimary", "b_fsensecondary", "b_ftertiary", "b_harassment",
  "b_hclprevalence", "b_hclprevalence_project", "b_hcltothrs",
  "b_healthissues", "b_hh_below15_masked", "b_hrsconstruction",
  "b_hrsdomestic", "b_hrsfarming", "b_hrsfire", "b_hrshandcraft",
  "b_hrslivestock", "b_hrsmore100", "b_hrsother", "b_hrswashing",
  "b_hrswater", "b_hrsworked_tot", "b_hrsworked_tot_nodom_masked",
  paste0("b_hw1_", letters[5:26], "_dum"), # b_hw1_e_dum ... b_hw1_z_dum
  "b_hw1_st_dum", "b_hw1_yz_dum",
  paste0("b_i1_", letters[1:12], "_dum"), # b_i1_a_dum ... b_i1_l_dum
  "b_i1_nonpoultry", "b_institutions", "b_interview_month",
  "b_interview_year", "b_lastjunsecondary", "b_lastnosch", "b_lastprimary",
  "b_lastsensecondary", "b_lastvoctraining", "b_locations", "b_machineryuse",
  "b_mfs_agrwithoutprotection", "b_milliterate", "b_minor",
  "b_mjunsecondary", "b_mprimary", "b_msensecondary", "b_mtertiary",
  "b_repetition", "b_single", "b_useproduct",

  # --- e_-prefixed derived indicators ---
  "e_activities", "e_HRS_WKD", "e_HRS_WKD2", "e_HZAG", "e_NO_REST_DAY",
  "e_conditions", "e_conditions2", "e_harassment", "e_healthissues",
  "e_hlprevalence", "e_institutions", "e_locations", "e_machineryuse",
  "e_mfs_agwoprotect", "e_minor", "e_protectivegear",
  "e_tot_hrs_pastweek_masked", "e_useproduct",

  # --- unprefixed variables ---
  "female", "age_imputed", "single", "hw1m_d", "hh1", "hh5", "i1d", "i2",

  # --- b_ questionnaire items: q17 and d section ---
  "b_q17_ownphone", "b_d1_female", "b_d2_year", "b_d3", "b_d4", "b_d4a",
  "b_d4b", "b_d4b_days", "b_d5a", "b_d5b", "b_d5c",

  # --- b_w1: each letter a-f has the item plus suffixes 1-3 (a also _work) ---
  "b_w1a", "b_w1a_work", "b_w1a1", "b_w1a2", "b_w1a3",
  "b_w1b", "b_w1b1", "b_w1b2", "b_w1b3",
  "b_w1c", "b_w1c1", "b_w1c2", "b_w1c3",
  "b_w1d", "b_w1d1", "b_w1d2", "b_w1d3",
  "b_w1e", "b_w1e1", "b_w1e2", "b_w1e3",
  "b_w1f", "b_w1f1", "b_w1f2", "b_w1f3",

  # --- b_w2 ---
  paste0("b_w2", letters[1:9]), # b_w2a ... b_w2i
  "b_w2_other",

  # --- b_w3: for k = 1..10 -> b_w3_<k>a..d followed by b_w3_<k>e_masked ---
  unlist(lapply(1:10, function(k) {
    c(paste0("b_w3_", k, letters[1:4]), paste0("b_w3_", k, "e_masked"))
  })),
  # --- b_w3: for k = 1..10 -> b_w3_<k>1 ... b_w3_<k>7 (b_w3_11 ... b_w3_107) ---
  paste0("b_w3_", rep(1:10, each = 7), rep(1:7, times = 10)),

  # --- b_w4b to b_w7 ---
  paste0("b_w4b_", 1:10), "b_w4b_other",
  paste0("b_w5_", 1:6),
  paste0("b_w6_", 1:7),
  paste0("b_w7_", 1:2),

  # --- b_hw1: letters a-z with "farming" after f and "oth" after o ---
  paste0("b_hw1_", letters[1:6]), # a ... f
  "b_hw1_farming",
  paste0("b_hw1_", letters[7:15]), # g ... o
  "b_hw1_oth",
  paste0("b_hw1_", letters[16:26]), # p ... z

  # --- b_hw2 to b_hw5 ---
  paste0("b_hw2_", letters[1:15]), # a ... o
  paste0("b_hw3_", letters[1:13]), # a ... m
  paste0("b_hw4_", letters[1:13]), # a ... m
  "b_hw4_n_other1", "b_hw4_n_other2",
  paste0("b_hw5_", letters[1:4]), # a ... d

  # --- b_hh, b_i1 (masked), b_i2/b_i3, b_ci21 ---
  paste0("b_hh", 1:4, "_masked"), "b_hh5",
  paste0("b_i1_", letters[2:12], "_masked"), # b_i1_b_masked ... b_i1_l_masked
  "b_i2", "b_i2plots_masked", "b_i3plots_masked", "b_ci21",

  # --- e_ questionnaire items: interview date, q14 and d section ---
  "e_interview_month", "e_interview_year", "e_q14", "e_d1a_year_masked",
  "e_d1b_masked", "e_d2", "e_d3a", "e_d3b", "e_d3bsp_masked",

  # --- e_w1 ---
  "e_w1a_tea", "e_w1a_coff", "e_w1a_rice",
  "e_w1b_tea", "e_w1b_coff", "e_w1b_rice",
  paste0("e_w1c_", letters[1:12]), # a ... l
  paste0("e_w1d1", c(letters[1:22], "z")), # a ... v, then z
  paste0("e_w1d2", c(letters[1:22], "z")), # a ... v, then z

  # --- e_w2 and e_w3 ---
  paste0("e_w2a", 1:7),
  "e_w2b_masked", "e_w2c_masked", "e_w2d",
  paste0("e_w3", letters[1:6]), # a ... f

  # --- e_hw1 to e_hw5 ---
  paste0("e_hw1", letters[1:6]), # a ... f
  paste0("e_hw2", letters[1:15]), # a ... o
  paste0("e_hw3", letters[1:13]), # a ... m
  paste0("e_hw4", letters[1:13]), # a ... m
  paste0("e_hw5", letters[1:4]) # a ... d
)

# Hand the full candidate list to the capture_tables() helper from the sourced
# functions file.
capture_tables(indirect_PII)

#' #Matching and crosstabulations: Run automated PII check 

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age

# Build a single education key variable: start from b_d5a and, for rows where
# it is missing, fall back to the value of b_d4a in the same row.
mydata[["educ"]] <- mydata[["b_d5a"]]
mydata[["educ"]][is.na(mydata[["educ"]])] <-
  mydata[["b_d4a"]][is.na(mydata[["educ"]])]
selectedKeyVars <- c("female", "age_imputed", "educ") ##!!! Replace with candidate categorical demo vars

# Weight variable (placeholder, currently not used)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# Household id variable / cluster (placeholder, currently not used)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# Create the sdcMicro object from the selected key variables and print it so
# that its summary appears in the rendered report.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
#' !!!No records violate 2-anonymity

#' #Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-ended text columns passed to the report_open() helper for review.
open_ends <- c(
  "b_d3_other",
  "e_v3a_other_transl",
  "e_v4a_other_transl"
)
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# Drop b_d3_other entirely, as it is actually verbatim data in Kinyarwanda.
mydata <- mydata[!(names(mydata) %in% "b_d3_other")]

# Redact individual responses in e_v4a_other_transl by row position.
# NOTE(review): the row numbers are hard-coded, so they are only valid while
# the rows of mydata keep the order they had when "verbatims.csv" was reviewed.
# Rows 3, 15 and 18: verbatim Kinyarwanda text replaced by a placeholder.
mydata$e_v4a_other_transl[c(3, 15, 18)] <- "[Kinyarwanda]"
# Row 4: redacted because a small location appears in the response.
mydata$e_v4a_other_transl[4] <- "[Location]"
# Row 16: redacted because a school name appears in the response.
mydata$e_v4a_other_transl[16] <- "[School]"

#' #GPS data: Displace
# !!! No GPS

#' #Save processed data in Stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name 

# Write the processed data twice, as "<filename>_PU.dta" (Stata) and
# "<filename>_PU.sav" (SPSS), relative to the current working directory.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Add report title dynamically
# Report title: fixed prefix followed by the dataset name. It is referenced by
# the inline `r title_var` expression in the YAML block that follows.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
